  # Cleans the LISS data

# load libraries
library(data.table)
library(tidyverse)
library(haven)
library(dplyr)

data <- LISS_data

# Rename
data <- data %>% rename(worked = q1, lastjob = q2)

# Remove No Recent Job
norecentjob <- data %>% filter(lastjob < 2015 | worked == 0) %>% nrow()
data <- data %>% filter(!(lastjob < 2015 & !is.na(lastjob)))
data <- data %>% filter(!(worked == 0 & !is.na(worked)))
  
# Drop if Formally Retired or Too Old (66+)
data <- data %>% filter(!belbezig_p %in% 9)
data <- data %>% filter(leeftijd <= 65)

# Drop if Not Finished Complete Survey
data <- data %>% filter(!(is.na(DatumE) | DatumE == ""))
  
# Merge in Background variables
background_data <- read_dta(file.path(dir, "DataCode/Data/Raw/LISS/background_202106.dta"))
data <- merge(data, background_data, by = "nomem_encr", all = FALSE)
  
# Merge in Sectors & Occupations
work_data <- read_dta(file.path(dir, "DataCode/Data/Raw/LISS/work_2021.dta"))
data <- merge(data, work_data, by = "nomem_encr", all = FALSE)
  
# Renaming some variables (the parameters of the DCE)
data <- data %>%
  rename(
    workhoursA = c_WH_A, whB1 = c_WH_B, whB5 = c_WH_B1, whB6 = c_WH_B2, whB7 = c_WH_B3,
    flexibilityA = c_FL_A, flB2 = c_FL_B, flB6 = c_FL_B1, flB8 = c_FL_B2, flB9 = c_FL_B3,
    WTPflex = c_FL_WTP, telecommuteA = c_TC_A, telecommuteB = c_TC_B, meaningA = c_MW_A,
    meaningB = c_MW_B, ywageA = c_W_A, monthlywageA = c_W_Am,
    hrlywageA = c_HW_A
  )

############################
### Cleaning Base Sample ###
############################
  
# Married Dummy
data <- data %>% mutate(married = ifelse(burgstat.x == 1, 1, 0))
data <- data %>% filter(!is.na(burgstat.x))
  
# Age controls
colnames(data)[colnames(data) == "leeftijd.x"] <- "leeftijd"
data <- data %>% mutate(leeftijd2 = leeftijd^2, leeftijd3 = leeftijd^3)

# Construct Years of Education variable - 
set.seed(0)  # to replicate draws from runiform
  
data$education <- ifelse(data$cw21n005 %in% c(1, 2, 4) | data$cw21n006 == 1, sample(1:7, length(data$cw21n005), replace = TRUE), NA)
data$education[data$cw21n005 %in% c(1, 2, 4) | data$cw21n006 == 1] <- 8  # less than primary school OR in special education
data$education[data$cw21n005 %in% 3 | data$cw21n006 == 2] <- 10  # primary school
data$education[data$cw21n005 %in% c(5, 7)] <- sample(9:11, 1)  # vglo or huishoudschool 
data$education[data$cw21n005 %in% 8] <- runif(1, 9, 11)  # ulo/mulo/mavo
data$education[data$cw21n005 %in% c(6, 9, 10)] <- 12  # lbo or vmbo
data$education[data$cw21n005 %in% c(11, 12, 13) | data$cw21n006 == 4] <- 13  # mms or havo
data$education[data$cw21n005 %in% c(14, 15, 16)] <- 14  # VWO or Gymnasium
data$education[data$cw21n005 %in% c(17, 18, 19) | data$cw21n006 == 5] <- 16  # middle professional education
data$education[data$cw21n005 %in% c(20, 21, 22, 23, 24, 25) | data$cw21n006 == 6] <- 17  # upper professional or academic bachelor
data$education[data$cw21n005 %in% 26 | data$cw21n006 == 7] <- sample(18:19, 1)  # master degree
data$education[data$cw21n005 %in% 27] <- sample(23:25, 1)  # phd+
  
# Gender variable
data <- data %>% mutate(woman = ifelse(geslacht.x == 2, 1, 0))

# Children variables
data <- data %>% mutate(nrchild_hh = case_when(a1 == 1 ~ 1, a1 == 2 ~ 2, a1 >= 3 ~ 3))
data <- data %>% mutate(nrchild_hh = if_else(is.na(nrchild_hh), 0, nrchild_hh))
  
# Children variable
data <- data %>% mutate(children = if_else(nrchild_hh>0, 1, 0))
  
# Children under 5
data <- data %>% mutate(child5 = ifelse(b1 < 5 | c1 < 5 | c2 < 5 | c3 < 5 | c4 < 5 | c5 < 5 | c6 < 5 | c7 < 5 | c8 < 5 | c9 < 5, 1, 0))
data <- data %>% mutate(child5 = if_else(is.na(child5), 0, child5))
  
# Hours
data <- data %>%
    mutate(hours = rowSums(select(., q3a, q3b), na.rm = TRUE)) %>%
    mutate(hours = ifelse(hours > 60, 60, hours))
  
# Part-Time: SPT - LPT - FT Indicator
data <- data %>%
  mutate(whcat = case_when(
    hours > 0 & hours < 32 ~ 0,
    hours >= 32 & hours < 38 ~ 1,
    hours >= 38 ~ 2
  ))
  
# Aggregate PT Indicator
data <- data %>% mutate(part_time = ifelse(hours >= 36, 0, 1))

# Schedule Adaptability
data <- data %>% rename(flex = q4a, flex_precov = q4b, flex_last = q4c)
data <- data %>% mutate(flex_cov = rowSums(select(., flex_precov, flex_last), na.rm = TRUE))
data <- data[data$flex_cov != 4, ]
data <- data %>% mutate(covid_flex = flex - flex_cov) # check: change around covid
  
# Telecommuting
data <- data %>% rename(tel = q5, tel_precov = q6, tel_last = q5b)
data <- data %>% mutate(tel_cov = rowSums(select(., tel_precov, tel_last), na.rm = TRUE))
data <- data %>% mutate(covid_tel = tel - tel_cov) # check: change around covid
  
  
# Meaning
data <- data %>% mutate(meaning = rowSums(select(., q7a, q7b), na.rm = TRUE))
  
# Dummies for Work Meaning and Workplace Flexibility
data <- data %>%
  mutate(high_flex = ifelse(flex_cov %in% 2:3, 1, 0),
         high_meaning = ifelse(meaning %in% 2:3, 1, 0),
         high_telecommute = ifelse(tel_cov %in% 2:3, 1, 0))

data <- data %>%
  mutate(high_flex_aftcov = ifelse(flex %in% 2:3, 1, 0),
         high_telecommute_aftcov = ifelse(tel %in% 2:3, 1, 0))

# Contact and Computer Use
data$contact <- rowSums(data[, c("bg1", "bg1b")], na.rm = TRUE)
data$high_contact <- data$contact > 3
  
data$computer <- data$bg2 / 100
p50 <- median(data$computer, na.rm = TRUE)
data$high_computer <- data$computer > p50
  
# Wages
# First: combine the bracket and integer wages for current workers
data <- data %>% mutate(q9a = ifelse(q9a < 0, NA, q9a))
data <- data %>% mutate(q10a = as.numeric(q10a), q10a_recoded = recode(q10a, `0` = 325, `1` = 975, `2` = 1650, `3` = 2500, `4` = 3500, `5` = 5000, `6` = 12500))
data <- data %>% mutate(wage_current = if_else(is.na(q9a) & is.na(q10a_recoded), NA_real_, rowSums(select(., q9a, q10a_recoded), na.rm = TRUE)))
  
# Then: combine bracket and integer wages for those not working
data <- data %>% mutate(q9b = ifelse(q9b < 0, NA, q9b))
data <- data %>% mutate(q10b = as.numeric(q10b), q10b_recoded = recode(q10b, `0` = 325, `1` = 975, `2` = 1650, `3` = 2500, `4` = 3500, `5` = 5000, `6` = 12500))
data <- data %>% mutate(wage_unempl = if_else(is.na(q9b) & is.na(q10b_recoded), NA_real_, rowSums(select(., q9b, q10b_recoded), na.rm = TRUE)))
    
# Finally: combine workers and non-workers
data <- data %>% mutate(wage = rowSums(select(., wage_current, wage_unempl), na.rm = TRUE))
  
# Calculate Hourly Wages
data <- data %>% mutate(hourwage = wage / (hours * (52/12)))

# Clean separately for men and women
perc <- quantile(data$hourwage, probs = c(0.025, 0.975), type = 7) # type 7 = same as Stata
data <- data %>% filter(!(hourwage <= perc[1]) & !(hourwage >= perc[2]))
  
# Construct Quartiles
data$wage_q <- cut2(data$wage, g = 10,  labels=FALSE)
levels(data$wage_q) <- 1:10
  
# Take log wages
data <- data %>% mutate(ln_hourwage = log(hourwage), ln_wage = log(wage))
  
# Sectors
data <- data %>% rename(sector = cw21n402) 

# Final data selection
data <- data %>%
  select(
    telecommuteA,workhoursA, flexibilityA, meaningA, hrlywageA,  monthlywageA, wage_B1, wage_B2, wage_B3, wage_B4, wage_B5, wage_B6, wage_B7, wage_B8, wage_B9, wage_B10, hce1, hce2, hce3, hce4, hce5, hce6, hce7, hce8, hce9, hce10,
    workhoursA, whB1, whB5, whB6, whB7, flexibilityA, flB2, flB6, flB8, flB9, WTPflex, telecommuteA, telecommuteB, meaningA, meaningB, ywageA, monthlywageA, hrlywageA,
    woman, leeftijd, leeftijd2, leeftijd3, married, education, nrchild_hh, children, child5, nomem_encr, high_flex_aftcov, high_telecommute_aftcov, duur,
    hours, whcat, high_flex, high_meaning, high_telecommute, high_contact, high_computer, wage, hourwage, ln_hourwage, wage_q, lastjob, sector)

  
clean_LISS_data <- data
rm("data", "background_data", "work_data")